Libraries

Install R packages if needed.

# Required packages
required_packages <- c(
    "rmarkdown",
    "bookdown",
    "knitr",
    "lubridate",
    "tidyverse",
    "purrr",
    "glue",
    "lubridate",
    "sf",
    "tmap",
    "leaflet",
    "leaflet.extras"
)

# Try to install packages if not installed
default_options <- options()
tryCatch(
    {
        # Disable interactivity
        options(install.packages.compile.from.source = "always")
        
        # Install package if not installed
        for (package in required_packages) {
            is_package_installed <- require(package, character.only = TRUE)
            if (!is_package_installed) {
                cat(paste0("Installing package: ", package, "\n"))
                install.packages(package)
            } else {
                cat(paste0("Package already installed: ", package, "\n"))
            }
        }
    },
    error = function(cond) {
        stop(cond)
    },
    finally = {
        options(default_options) # reset interactivity
    }
)

Load R libraries.

library(ggplot2)
library(glue)
library(leaflet)
library(leaflet.extras)
library(lubridate)
library(sf)
library(tidyverse)
library(tmap)

Data

  • vancksi_raw1: is the file received from Brice Batomen on July 3, 2024 via email, which was originally obtained directly from ICBC
  • vancksi_raw2: is the file downloaded here with the documentation provided at ICBC’s Tableau website here on September 13, 2024
vancksi_raw1 <- read.table("../../tmp/vanc-test.dat", sep = "|", header = T)
vancksi_raw2 <- read_delim("../../tmp/vanc-ksi-lowermainland-2024-09-13.csv", delim = "\t", locale = locale(encoding="UTF-16"))
vancbounds_raw <- read_sf("../../tmp/vanc-localareabound-2024-09-13.geojson")

ICBC (Direct)

vancksi_raw1 %>% head

ICBC (Tableau)

vancksi_raw2 %>% head

Vancouver Bounds

Use the local area boundary as the City of Vancouver boundaries obtained from the open data portal here.

vancbounds_raw %>% head

Cleaning

Vancouver Bounds

Buffer boundaries by 100 meters to capture errorneous points.

vancbounds <- vancbounds_raw %>% st_buffer(100)

ICBC (Direct)

Convert to Spatial Data

vancksi1 <- st_as_sf(
    vancksi_raw1 %>% filter(!is.na(LATITUDE) & !is.na(LONGITUDE)),
    coords = c("LONGITUDE", "LATITUDE"),
    crs = 4326
)

There are missing coordinates in the data.

cat(
    "\nICBC (Direct) Missing Coordinates: ", vancksi_raw1 %>% filter(is.na(LATITUDE) | is.na(LONGITUDE)) %>% nrow,
    "\nICBC (Direct) Missing Longitudes: ", vancksi_raw1 %>% filter(is.na(LONGITUDE)) %>% nrow,
    "\nICBC (Direct) Missing Latitudes: ", vancksi_raw1 %>% filter(is.na(LATITUDE)) %>% nrow
)
## 
## ICBC (Direct) Missing Coordinates:  4617 
## ICBC (Direct) Missing Longitudes:  4617 
## ICBC (Direct) Missing Latitudes:  4617

Crop to City Bounds

before_crop <- vancksi1 %>% nrow
vancksi1 <- vancksi1 %>%
    st_crop(vancbounds)
after_crop <- vancksi1 %>% nrow
cat("Rows cropped: ", before_crop - after_crop)
## Rows cropped:  1284

Killed or Serious Injuries Only

Injuries only (Fatal injury, Serious injury - Overnight at hospital) for column INJURY_TYPE:

vancksi1 <- vancksi1 %>%
    filter(INJURY_TYPE %in% c(
        "Fatal injury",
        "Serious injury - Overnight at hospital"
    ))

Preview

vancksi1 %>% as_tibble %>% select(-geometry) %>% head

Details

vancksi_raw1 %>%
    group_by(INJURY_TYPE) %>%
    count %>%
    mutate(
        perc = n / nrow(vancksi_raw1) * 100
    )

ICBC (Tableau)

Convert to Spatial Data

vancksi2 <- st_as_sf(
    vancksi_raw2 %>% filter(!is.na(Latitude) & !is.na(Longitude)),
    coords = c("Longitude", "Latitude"),
    crs = 4326
)

There are missing coordinates in the data.

cat(
    "\nICBC (Tableau) Missing Coordinates: ", vancksi_raw2 %>% filter(is.na(Latitude) | is.na(Longitude)) %>% nrow,
    "\nICBC (Tableau) Missing Longitudes: ", vancksi_raw2 %>% filter(is.na(Longitude)) %>% nrow,
    "\nICBC (Tableau) Missing Latitudes: ", vancksi_raw2 %>% filter(is.na(Latitude)) %>% nrow
)
## 
## ICBC (Tableau) Missing Coordinates:  106871 
## ICBC (Tableau) Missing Longitudes:  106871 
## ICBC (Tableau) Missing Latitudes:  106871

Crop to City Bounds

before_crop <- vancksi2 %>% nrow
vancksi2 <- vancksi2 %>%
    st_crop(st_bbox(vancbounds))
after_crop <- vancksi2 %>% nrow
cat("Rows cropped: ", before_crop - after_crop)
## Rows cropped:  572000

Killed or Serious Injuries Only

Only collisions with CASUALTY CRASH according to the data dictionary here for column Crash Severity:

vancksi2 <- vancksi2 %>%
    filter(`Crash Severity` == "CASUALTY CRASH")

Preview

vancksi2 %>% as_tibble %>% select(-geometry) %>% head

Details

vancksi_raw2 %>%
    group_by(`Crash Severity`) %>%
    count %>%
    mutate(
        perc = n / nrow(vancksi_raw2) * 100
    )

Exploration

Column Differences

The Tableau data has 11 more columns that the direct data with more temporal details.

Details

cat(
    "\nColumns in ICBC (Direct): ", ncol(vancksi1),
    "\nColumns in ICBC (Tableau): ", ncol(vancksi2),
    "\nColumns difference: ", ncol(vancksi2) - ncol(vancksi1)
)
## 
## Columns in ICBC (Direct):  17 
## Columns in ICBC (Tableau):  28 
## Columns difference:  11

Columns

Time, day, month, and year are all given in the Tableau data, while the direct data only contains the month and year.

ICBC (Direct):

vancksi1 %>% colnames
##  [1] "REGION"                "MUNICIPALITY_NAME"     "YEAR"                 
##  [4] "MONTH"                 "HIT_AND_RUN_INDICATOR" "CRASH_CONFIGURATION"  
##  [7] "LAND_USE"              "LIGHT"                 "IN_PARKING_LOT"       
## [10] "AGE_RANGE"             "GENDER"                "PASSENGER_POSITION"   
## [13] "ROLE"                  "INJURY_TYPE"           "SAFETY_EQUIPMENT"     
## [16] "VICTIM_COUNT"          "geometry"

ICBC (Tableau):

vancksi2 %>% colnames
##  [1] "Crash Breakdown 2"           "Date Of Loss Year"          
##  [3] "Animal Flag"                 "Crash Severity"             
##  [5] "Cyclist Flag"                "Day Of Week"                
##  [7] "Derived Crash Configuration" "Heavy Vehicle Flag"         
##  [9] "Intersection Crash"          "Month Of Year"              
## [11] "Motorcycle Flag"             "Municipality Name (ifnull)" 
## [13] "Parked Vehicle Flag"         "Parking Lot Flag"           
## [15] "Pedestrian Flag"             "Region"                     
## [17] "Street Full Name (ifnull)"   "Time Category"              
## [19] "Municipality Name"           "Road Location Description"  
## [21] "Street Full Name"            "Metric Selector"            
## [23] "Total Crashes"               "Total Victims"              
## [25] "Cross Street Full Name"      "Mid Block Crash"            
## [27] "Municipality With Boundary"  "geometry"

Row Differences

The Tableau data has more records than the direct data.

Details

cat(
    "\nRows in ICBC (Direct): ", nrow(vancksi1),
    "\nRows in ICBC (Tableau): ", nrow(vancksi2),
    "\nRows difference: ", nrow(vancksi2) - nrow(vancksi1)
)
## 
## Rows in ICBC (Direct):  1834 
## Rows in ICBC (Tableau):  41112 
## Rows difference:  39278

Rows

ICBC (Direct):

vancksi1 %>% as_tibble %>% select(-geometry) %>% head

ICBC (Tableau):

vancksi2 %>% as_tibble %>% select(-geometry) %>% head

Temporal

The direct data covers a longer time span (2010 to 2021) than the tableau data (2019 to 2023).

cat(
    "\nICBC (Direct) Years: ", min(vancksi1$YEAR), "to", max(vancksi1$YEAR),
    "\nICBC (Tableau) Years: ", min(vancksi2$`Date Of Loss Year`), "to", max(vancksi2$`Date Of Loss Year`)
)
## 
## ICBC (Direct) Years:  2010 to 2021 
## ICBC (Tableau) Years:  2019 to 2023

ICBC (Direct):

vancksi1 %>%
    group_by(YEAR) %>%
    count

ICBC (Tableau):

vancksi2 %>%
    group_by(`Date Of Loss Year`) %>%
    count

Location

Each record seems to represent collision locations as there are victim counts for each row in both datasets.

Map

tmap_mode("view")
tm_basemap("CartoDB.Positron") +
    tm_shape(
        vancksi2 %>% filter(`Date Of Loss Year` == 2019),
        name = "ICBC (Tableau)"
    ) +
    tm_dots(
        popup.vars = T
    ) +
    tm_shape(
        vancksi1 %>% filter(YEAR == 2019),
        name = "ICBC (Direct)"
    ) +
    tm_dots(
        col = "red",
        popup.vars = T
    )